---
title: "The Gender Gap in Peace and Conflict Journals, 2000–2024"
subtitle: "Replication Code: Author Gender Analysis"
author:
  - Daina Chiba
  - Wakako Maekawa
date: today
format:
  html: 
    toc: true
    page-layout: full
    embed-resources: true
code-fold: false
execute:
  message: false
  warning: false
---

# Preparation

Install and load necessary packages.

```{r}
## Install pacman if not already installed.
## require() returns FALSE rather than erroring when the package is missing,
## which is what the negated test relies on; on success it attaches pacman.
if (!require("pacman")) install.packages("pacman")

## Unload any loaded packages: p_loaded() supplies the package names and
## character.only = TRUE tells p_unload() to treat them as strings.
pacman::p_unload(pacman::p_loaded(), character.only = TRUE)

## Delete everything in the memory (`all` partially matches ls()'s
## `all.names` argument, so dot-prefixed objects are removed as well).
rm(list=ls(all=TRUE))

## Load packages
## NOTE(review): knitr, stargazer and ggeffects are not used in the chunks
## visible here -- presumably needed further down or in the paper; confirm.
pacman::p_load(tidyverse, knitr, stargazer, ggeffects, viridis, scales)
```

Define colors

```{r}
## Fill colours for the author-level bar charts: male, female and
## unknown-gender authors respectively.
col_m <- "#5DC863FF"
col_f <- "#440154FF"
col_u <- "#CDCDCD"  # light grey, i.e. rgb(205, 205, 205) on a 0-255 scale
```

Load data

```{r}
## Read the full data set, then keep one subset per journal.
df_all <- readRDS("data_author_gender.rds")

df.cmps <- filter(df_all, journal == "CMPS")
df.ii <- filter(df_all, journal == "II")
df.isq <- filter(df_all, journal == "ISQ")
df.jcr <- filter(df_all, journal == "JCR")
df.jpr <- filter(df_all, journal == "JPR")
```

# Article-Level Analysis

Drop articles with authors of unknown gender.

```{r}
## Keep only rows with no unknown-gender authors, then classify each row by
## team size (`single`) and by gender composition (`gender`).
df_nna <- df_all |> 
  filter(unknown_num == 0) |> 
  mutate(single = if_else(author_num == 1, "Single", "Coauthor"), 
         ## NA_character_ (not a bare logical NA) keeps both if_else()
         ## branches the same type, which dplyr before 1.1.0 requires.
         gender = if_else(per_male == 1, "Male", NA_character_),
         gender = if_else(per_female == 1, "Female", gender),
         ## Applied last, so any row with both genders present is "Mixed".
         gender = if_else(per_male > 0 & per_female > 0, "Mixed", gender)
         )

## Cross-tabulate the two classifications; useNA = "ifany" exposes any row
## that none of the three gender rules matched.
with(df_nna, table(gender, single, useNA = "ifany"))
```

Ensure correct ordering of categorical variables.

```{r}
## Convert both classifications to factors with an explicit level order.
df_nna <- df_nna |> 
  mutate(gender = factor(gender, levels = c("Female", "Mixed", "Male")),
         single = factor(single, levels = c("Single", "Coauthor")))
```

Define author_cat.

```{r}
## Combine gender composition and team size into a five-level category.
df_nna <- df_nna |> mutate(
  ## NA_character_ keeps both if_else() branches character-typed, which
  ## dplyr before 1.1.0 requires (a bare NA is logical).
  author_cat = if_else(gender == "Male" & single == "Single", "Solo male", NA_character_),
  author_cat = if_else(gender == "Female" & single == "Single", "Solo female", author_cat),
  author_cat = if_else(gender == "Male" & single == "Coauthor", "All male", author_cat),
  author_cat = if_else(gender == "Female" & single == "Coauthor", "All female", author_cat),
  author_cat = if_else(gender == "Mixed", "Mixed team", author_cat),
  ## The level order fixes the stacking / legend order in the plots.
  author_cat = factor(author_cat, 
      levels = c("Solo male", "All male", "Mixed team", "All female", "Solo female"))
)

## Number of rows in each category
table(df_nna$author_cat)
```

Basic summary statistics

```{r}
## Mean of the per-row female share, separately for single-authored and
## co-authored rows.
summarise(group_by(df_nna, single), per_female = mean(per_female))
```

```{r}
## Mean per-row female share by team size.
## NOTE(review): this chunk computes exactly the same table as the chunk
## directly above it -- possibly a different grouping variable was intended;
## confirm or drop.
df_nna |> 
  group_by(single) |> 
  summarise(per_female = mean(x = per_female))
```

```{r}
## Pooled female share among rows classified as "Single": total authors,
## total female authors, and their ratio.
df_nna |> 
  filter(single == "Single") |> 
  summarise(across(c(author_num, female_num), sum)) |> 
  mutate(per_female = female_num / author_num)
```

```{r}
## Pooled female share among rows classified as "Coauthor": total authors,
## total female authors, and their ratio.
df_nna |> 
  filter(single == "Coauthor") |> 
  summarise(across(c(author_num, female_num), sum)) |> 
  mutate(per_female = female_num / author_num)
```

```{r}
## Pooled female share among rows with exactly one author.
df_nna |> 
  filter(author_num == 1) |> 
  summarise(across(c(author_num, female_num), sum)) |> 
  mutate(per_female = female_num / author_num)
```

```{r}
## Pooled female share among rows with exactly two authors.
df_nna |> 
  filter(author_num == 2) |> 
  summarise(across(c(author_num, female_num), sum)) |> 
  mutate(per_female = female_num / author_num)
```

```{r}
## Pooled female share among rows with more than two authors.
df_nna |> 
  filter(author_num > 2) |> 
  summarise(across(c(author_num, female_num), sum)) |> 
  mutate(per_female = female_num / author_num)
```


Prepare the data for plotting

```{r}
## Counts by year and author category pooled over journals, labelled "All".
plot_all <- df_nna |> 
  count(year_v, author_cat) |> 
  mutate(journal = "All")

## Per-journal counts with the pooled rows appended; the factor levels set
## the facet order used in the plot.
plot_data <- df_nna |> 
  count(journal, year_v, author_cat) |> 
  bind_rows(plot_all) |> 
  mutate(journal = factor(journal, 
                          levels = c("All", "ISQ", "JCR", "JPR", "CMPS", "II")))
```

## Yearly trend

```{r}
## Composition of author categories per year, one facet per journal.
## position = "fill" rescales every bar to height 1, so bars show shares.
ggplot(plot_data, aes(x = year_v, y = n, fill = author_cat)) + 
  geom_col(position = "fill") + 
  scale_fill_viridis_d(direction = -1) + 
  labs(x = "Year", y = "Ratio", fill = NULL) + 
  facet_wrap(~ journal, ncol = 3) + 
  theme_minimal() + 
  theme(legend.position = "bottom", 
        axis.text.x = element_text(angle = 90))
```


## Five-Year Bin

```{r}
## Assign each publication year to a five-year bin. The labels contain a line
## break so that they wrap on the x axis. Years before 2000 match no rule
## (NA); every year from 2020 onwards falls into the last bin.
df_nna <- df_nna |> 
  mutate(interval = case_when(
    year_v >= 2000 & year_v < 2005 ~ "2000-\n 2004",
    year_v >= 2005 & year_v < 2010 ~ "2005-\n 2009",
    year_v >= 2010 & year_v < 2015 ~ "2010-\n 2014",
    year_v >= 2015 & year_v < 2020 ~ "2015-\n 2019",
    year_v >= 2020 ~ "2020-\n 2024"
  ))

## Check that the years fall into the intended bins
with(df_nna, table(interval, year_v))

## Counts by bin and author category pooled over journals, labelled "All".
plot_all <- df_nna |> 
  count(interval, author_cat) |> 
  mutate(journal = "All")

## Per-journal counts with the pooled rows appended; the factor levels set
## the facet order.
plot_five <- df_nna |> 
  count(journal, interval, author_cat) |> 
  bind_rows(plot_all) |> 
  mutate(journal = factor(journal, 
                          levels = c("All", "ISQ", "JCR", "JPR", "CMPS", "II")))
```

## Figure 2 in the paper

```{r}
## Figure 2: share of each author category per five-year bin, by journal.
## position = "fill" normalises each bar to 1; the y axis is shown in percent.
ggplot(plot_five, aes(x = interval, y = n, fill = author_cat)) + 
  geom_col(position = "fill") + 
  scale_fill_viridis_d(direction = -1) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(x = "Year of publication", y = NULL, fill = NULL) + 
  facet_wrap(~ journal, ncol = 3) + 
  theme_minimal() + 
  theme(legend.position = "top", 
        axis.text.x = element_text(angle = 0, size = 9),
        axis.text.y = element_text(size = 11),
        strip.text = element_text(size = 15),
        axis.title = element_text(size = 17),
        legend.title = element_text(size = 17),
        legend.text = element_text(size = 15)
        )
```



```{r}
## Pooled counts for the 2020-2024 bin, with each author category's share of
## the bin total. The shares are computed from the data rather than from
## hand-copied counts, so they stay correct if the input data change.
plot_five |> 
  filter(journal == "All" & interval == "2020-\n 2024") |> 
  mutate(share = n / sum(n))
```

```{r}
## Pooled counts for the 2000-2004 bin, with each author category's share of
## the bin total. The shares are computed from the data rather than from
## hand-copied counts, so they stay correct if the input data change.
plot_five |> 
  filter(journal == "All" & interval == "2000-\n 2004") |> 
  mutate(share = n / sum(n))
```



# Author-Level Analysis

## Basic statistics

### CMPS

Total number of male authors

```{r}
## Sum of male_num over all CMPS rows
df.cmps |> pull(male_num) |> sum()
```

Total number of female authors

```{r}
## Sum of female_num over all CMPS rows
df.cmps |> pull(female_num) |> sum()
```

Share of female authors among all authors

```{r}
## Female authors as a fraction of all CMPS authors
with(df.cmps, sum(female_num) / sum(author_num))
```

```{r}
# Aggregate at year level: one yearly total per count variable.
ag <- aggregate(author_num ~ year_v, data = df.cmps, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df.cmps, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df.cmps, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df.cmps, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_cmps <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
cmps_tidy <- ag_cmps |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

Gender balance by year (raw count)

```{r}
## Stacked yearly author counts for CMPS by gender category. The colours are
## matched to the factor levels of `name` in order.
ggplot(cmps_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col() + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Number of Authors", title = "CMPS") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

Gender balance by year (ratio)

```{r}
## Yearly gender composition for CMPS; position = "fill" rescales every bar
## to height 1 so the segments show shares rather than counts.
ggplot(cmps_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "CMPS") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

### JPR

Total number of male authors

```{r}
## Sum of male_num over all JPR rows
df.jpr |> pull(male_num) |> sum()
```

Total number of female authors

```{r}
## Sum of female_num over all JPR rows
df.jpr |> pull(female_num) |> sum()
```

Share of female authors among all authors

```{r}
## Female authors as a fraction of all JPR authors
with(df.jpr, sum(female_num) / sum(author_num))
```

```{r}
# Aggregate at year level: one yearly total per count variable.
ag <- aggregate(author_num ~ year_v, data = df.jpr, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df.jpr, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df.jpr, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df.jpr, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_jpr <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
jpr_tidy <- ag_jpr |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

Gender balance by year (raw count)

```{r}
## Stacked yearly author counts for JPR by gender category. The colours are
## matched to the factor levels of `name` in order.
ggplot(jpr_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col() + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Number of Authors", title = "JPR") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

Gender balance by year (ratio)

```{r}
## Yearly gender composition for JPR; position = "fill" rescales every bar
## to height 1 so the segments show shares rather than counts.
ggplot(jpr_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "JPR") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

### JCR

```{r}
# Total number of male authors
sum(df.jcr$male_num)

# Total number of female authors
sum(df.jcr$female_num)

# Ratio of female authors to all authors
sum(df.jcr$female_num) / sum(df.jcr$author_num)

# Aggregate at year level: one yearly total per count variable.
ag <- aggregate(author_num ~ year_v, data = df.jcr, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df.jcr, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df.jcr, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df.jcr, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_jcr <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
jcr_tidy <- ag_jcr |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

Gender balance by year (raw count)

```{r}
## Stacked yearly author counts for JCR by gender category. The colours are
## matched to the factor levels of `name` in order.
ggplot(jcr_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col() + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Number of Authors", title = "JCR") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

Gender balance by year (ratio)

```{r}
## Yearly gender composition for JCR; position = "fill" rescales every bar
## to height 1 so the segments show shares rather than counts.
ggplot(jcr_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "JCR") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

### II

```{r}
# Total number of male authors
sum(df.ii$male_num)

# Total number of female authors
sum(df.ii$female_num)

# Ratio of female authors to all authors
sum(df.ii$female_num) / sum(df.ii$author_num)

# Aggregate at year level: one yearly total per count variable.
ag <- aggregate(author_num ~ year_v, data = df.ii, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df.ii, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df.ii, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df.ii, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_ii <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
ii_tidy <- ag_ii |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

Gender balance by year (raw count)

```{r}
## Stacked yearly author counts for II by gender category. The colours are
## matched to the factor levels of `name` in order.
ggplot(ii_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col() + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Number of Authors", title = "II") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```


Gender balance by year (ratio)

```{r}
## Yearly gender composition for II; position = "fill" rescales every bar
## to height 1 so the segments show shares rather than counts.
ggplot(ii_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "II") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```


### ISQ

```{r}
# Total number of male authors
sum(df.isq$male_num)

# Total number of female authors
sum(df.isq$female_num)

# Ratio of female authors to all authors
sum(df.isq$female_num) / sum(df.isq$author_num)

# Aggregate at year level: one yearly total per count variable.
ag <- aggregate(author_num ~ year_v, data = df.isq, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df.isq, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df.isq, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df.isq, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_isq <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
isq_tidy <- ag_isq |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

Gender balance by year (raw count)

```{r}
## Stacked yearly author counts for ISQ by gender category. The colours are
## matched to the factor levels of `name` in order.
ggplot(isq_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col() + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Number of Authors", title = "ISQ") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

Gender balance by year (ratio)

```{r}
## Yearly gender composition for ISQ; position = "fill" rescales every bar
## to height 1 so the segments show shares rather than counts.
ggplot(isq_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "ISQ") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

### Aggregate

```{r}
# Total number of male authors
sum(df_all$male_num)

# Total number of female authors
sum(df_all$female_num)

# Ratio of female authors to all authors
sum(df_all$female_num) / sum(df_all$author_num)

# Aggregate at year level: one yearly total per count variable, pooled over
# every journal in df_all.
ag <- aggregate(author_num ~ year_v, data = df_all, FUN = sum)
ag1 <- aggregate(male_num ~ year_v, data = df_all, FUN = sum)
ag2 <- aggregate(female_num ~ year_v, data = df_all, FUN = sum)
ag3 <- aggregate(unknown_num ~ year_v, data = df_all, FUN = sum)

# Join on the year column explicitly instead of relying on left_join()'s
# implicit natural join.
ag_all <- ag |> 
  left_join(ag1, by = "year_v") |> 
  left_join(ag2, by = "year_v") |> 
  left_join(ag3, by = "year_v")

# Make the dataset tidy: one row per year and count type. author_num gets no
# label in case_when(), so it becomes NA and is removed by the filter.
all_tidy <- ag_all |> 
  pivot_longer(cols = author_num:unknown_num) |> 
  mutate(name = case_when(name == "male_num" ~ "Male authors",
                          name == "female_num" ~ "Female authors",
                          name == "unknown_num" ~ "Unknown")) |> 
  filter(!is.na(name)) |> 
  # The level order must match the colour order passed to the fill scale.
  mutate(name = factor(name, levels = c("Unknown",
                                        "Male authors",
                                        "Female authors")))
```

```{r}
## Yearly gender composition pooled over all journals; position = "fill"
## rescales every bar to height 1 so the segments show shares.
ggplot(all_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", y = "Ratio", title = "All journals") + 
  theme_minimal() + 
  theme(legend.position = "bottom")
```

## Summary of Results

```{r}
## Overall female share of authors per journal and pooled, rounded to three
## decimals. (The JPR value was previously assigned twice; once is enough.)
per_fem_cmps <- round(sum(df.cmps$female_num)/sum(df.cmps$author_num), 3)
per_fem_ii <- round(sum(df.ii$female_num)/sum(df.ii$author_num), 3)
per_fem_isq <- round(sum(df.isq$female_num)/sum(df.isq$author_num), 3)
per_fem_jcr <- round(sum(df.jcr$female_num)/sum(df.jcr$author_num), 3)
per_fem_jpr <- round(sum(df.jpr$female_num)/sum(df.jpr$author_num), 3)
per_fem_all <- round(sum(df_all$female_num)/sum(df_all$author_num), 3)

## Stack the per-journal and pooled yearly tables, tagging each with its
## journal; the factor levels set the facet order in the plots.
six_tidy <- bind_rows(
  cmps_tidy |> mutate(journal = "CMPS"),
  ii_tidy |> mutate(journal = "II"),
  isq_tidy |> mutate(journal = "ISQ"),
  jcr_tidy |> mutate(journal = "JCR"),
  jpr_tidy |> mutate(journal = "JPR"),
  all_tidy |> mutate(journal = "All")
  ) |> 
  mutate(journal = factor(journal, 
                          levels = c("All", "ISQ", "JCR", 
                                     "JPR", "CMPS", "II")))

## Number of rows contributed by each journal
with(six_tidy, table(journal))
```

```{r}
## Yearly gender composition, one facet per journal plus the pooled panel.
## position = "fill" rescales every bar to height 1.
ggplot(six_tidy, aes(x = year_v, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_manual(name = "Gender", values = c(col_u, col_m, col_f)) + 
  labs(x = "Year", 
       y = "Ratio",
       title = "Gender ratio for conflict research journals") + 
  facet_wrap(~ journal, ncol = 3) + 
  theme_minimal() + 
  theme(legend.position = "bottom", 
        axis.text.x = element_text(angle = 90))
```

## 2000-2015

```{r}
## Percentage of female authors among authors of known gender, per journal,
## over all years in the data.
six_tidy |> 
  filter(name != "Unknown") |> 
  group_by(journal) |> 
  summarise(
    percent_female = sum(value[name == "Female authors"]) / sum(value) * 100
  )
```

The same calculation restricted to publication years up to and including 2015 (`year_v <= 2015`).

```{r}
## Percentage of female authors among authors of known gender, per journal,
## using only years up to and including 2015.
six_tidy |> 
  filter(year_v <= 2015, name != "Unknown") |> 
  group_by(journal) |> 
  summarise(
    percent_female = sum(value[name == "Female authors"]) / sum(value) * 100
  )

```

## Five-Year Bin

```{r}
## Assign each year to a five-year bin. The labels contain a line break so
## that they wrap on the x axis. Years before 2000 match no rule (NA); every
## year from 2020 onwards falls into the last bin.
six_tidy <- six_tidy |> 
  mutate(
    interval = case_when(
      year_v >= 2000 & year_v < 2005 ~ "2000-\n 2004",
      year_v >= 2005 & year_v < 2010 ~ "2005-\n 2009",
      year_v >= 2010 & year_v < 2015 ~ "2010-\n 2014",
      year_v >= 2015 & year_v < 2020 ~ "2015-\n 2019",
      year_v >= 2020 ~ "2020-\n 2024"
    )
  )

## Male / female author totals per bin and journal (unknown gender excluded).
## ungroup() removes the grouping that summarise() leaves on its result, so
## later steps do not inherit it silently.
six_five <- six_tidy |> 
  filter(name != "Unknown") |> 
  group_by(interval, journal, name) |> 
  summarise(value = sum(value)) |> 
  ungroup() |> 
  mutate(name = factor(name, levels = c("Male authors", 
                                        "Female authors")))

## Male vs. female share per bin, one facet per journal; position = "fill"
## rescales every bar to height 1.
ggplot(six_five, aes(x = interval, y = value, fill = name)) + 
  geom_col(position = "fill") + 
  scale_fill_viridis_d(direction = -1) + 
  labs(x = "Year of publication", 
       y = "Ratio") + 
  theme_minimal() + 
  facet_wrap(~ journal, ncol = 3) + 
  theme(legend.position = "bottom", 
        axis.text.x = element_text(angle = 45))
```


## Figure 1 in the paper

```{r}
## Add, for every journal x bin, the total author count and the female
## percentage. sum() over the female rows yields 0 instead of a zero-length
## vector (and hence an error) if a group has no "Female authors" row.
## ungroup() keeps the grouping from leaking into later code.
plot_data <- six_five |> 
  group_by(journal, interval) |> 
  mutate(total = sum(value), 
         percent_female = sum(value[name == "Female authors"]) / total * 100) |> 
  ungroup()

## Figure 1: male vs. female share per bin and journal, with the female
## percentage printed once per bar.
ggplot(plot_data, aes(x = interval)) + 
  geom_col(aes(y = value, fill = name), position = "fill") + 
  geom_text(
    ## Only the female rows carry a label; the male rows get an empty string
    ## so that each bar is labelled once. The label is placed 0.05 above the
    ## female share on the 0-1 scale.
    aes(label = ifelse(name == "Female authors", 
                       paste0(round(percent_female, 1), "%"), 
                       ""),
        y = 0.05 + percent_female/100),
    color = "black", size = 3, vjust = 0
    ) + 
  scale_fill_viridis_d(direction = -1) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(x = "Year of publication", 
       fill = NULL,
       y = NULL) + 
  theme_minimal() + 
  facet_wrap(~ journal, ncol = 3) + 
  theme(legend.position = "top", 
        axis.text.x = element_text(angle = 0, size = 9),
        axis.text.y = element_text(size = 11),
        strip.text = element_text(size = 15),
        axis.title = element_text(size = 17),
        legend.title = element_text(size = 17),
        legend.text = element_text(size = 15))
```

